- 
                Notifications
    You must be signed in to change notification settings 
- Fork 14.9k
[VPlan] Unroll VPReplicateRecipe by VF. #142433
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
| @llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesPatch is 33.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142433.diff 16 Files Affected: 
 diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e9ace195684b3..beeab51e0806a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7557,6 +7557,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                            OrigLoop->getHeader()->getContext());
+  VPlanTransforms::runPass(VPlanTransforms::unrollByVF, BestVPlan, BestVF);
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 165b57c87beb1..c09970ef54103 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -261,6 +261,14 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
     return Data.VPV2Scalars[Def][0];
   }
 
+  // Look through BuildVector to avoid redundant extracts.
+  // TODO: Remove once replicate regions are unrolled explicitly.
+  auto *BV = dyn_cast<VPInstruction>(Def);
+  if (Lane.getKind() == VPLane::Kind::First && BV &&
+      BV->getOpcode() == VPInstruction::BuildVector) {
+    return get(BV->getOperand(Lane.getKnownLane()), true);
+  }
+
   assert(hasVectorValue(Def));
   auto *VecPart = Data.VPV2Vector[Def];
   if (!VecPart->getType()->isVectorTy()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 44f0b6d964a6e..cd0ee979c5943 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -907,6 +907,12 @@ class VPInstruction : public VPRecipeWithIRFlags,
     BranchOnCount,
     BranchOnCond,
     Broadcast,
+    /// Creates a vector containing all operands. The vector element count
+    /// matches the number of operands.
+    BuildVector,
+    /// Creates a struct of vectors containing all operands. The vector element
+    /// count matches the number of operands.
+    BuildStructVector,
     ComputeFindLastIVResult,
     ComputeReductionResult,
     // Extracts the last lane from its operand if it is a vector, or the last
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 926490bfad7d0..66df7e3ebf802 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -104,6 +104,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::AnyOf:
+  case VPInstruction::BuildVector:
+  case VPInstruction::BuildStructVector:
     return SetResultTyFromOp();
   case VPInstruction::FirstActiveLane:
     return Type::getIntNTy(Ctx, 64);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a4831ea7c11f7..69b49430b6659 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -493,6 +493,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case Instruction::ExtractElement: {
     assert(State.VF.isVector() && "Only extract elements from vectors");
+    return State.get(getOperand(0),
+                     VPLane(cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
+                                ->getZExtValue()));
     Value *Vec = State.get(getOperand(0));
     Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
     return Builder.CreateExtractElement(Vec, Idx, Name);
@@ -604,6 +607,34 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreateVectorSplat(
         State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
   }
+  case VPInstruction::BuildVector: {
+    auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
+    Value *Res = PoisonValue::get(
+        toVectorizedTy(ScalarTy, ElementCount::getFixed(getNumOperands())));
+    for (const auto &[Idx, Op] : enumerate(operands()))
+      Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
+                                              State.Builder.getInt32(Idx));
+    return Res;
+  }
+  case VPInstruction::BuildStructVector: {
+    // For struct types, we need to build a new 'wide' struct type, where each
+    // element is widened.
+    auto *STy =
+        cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
+    Value *Res = PoisonValue::get(
+        toVectorizedTy(STy, ElementCount::getFixed(getNumOperands())));
+    for (const auto &[Idx, Op] : enumerate(operands())) {
+      for (unsigned I = 0, E = STy->getNumElements(); I != E; I++) {
+        Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I);
+        Value *VectorValue = Builder.CreateExtractValue(Res, I);
+        VectorValue =
+            Builder.CreateInsertElement(VectorValue, ScalarValue, Idx);
+        Res = Builder.CreateInsertValue(Res, VectorValue, I);
+      }
+    }
+    return Res;
+  }
+
   case VPInstruction::ComputeFindLastIVResult: {
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
     // and will be removed by breaking up the recipe further.
@@ -864,10 +895,11 @@ void VPInstruction::execute(VPTransformState &State) {
   if (!hasResult())
     return;
   assert(GeneratedValue && "generate must produce a value");
-  assert(
-      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
-       State.VF.isScalar()) &&
-      "scalar value but not only first lane defined");
+  assert((((GeneratedValue->getType()->isVectorTy() ||
+            GeneratedValue->getType()->isStructTy()) ==
+           !GeneratesPerFirstLaneOnly) ||
+          State.VF.isScalar()) &&
+         "scalar value but not only first lane defined");
   State.set(this, GeneratedValue,
             /*IsScalar*/ GeneratesPerFirstLaneOnly);
 }
@@ -881,6 +913,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case Instruction::ICmp:
   case Instruction::Select:
   case VPInstruction::AnyOf:
+  case VPInstruction::BuildVector:
+  case VPInstruction::BuildStructVector:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExtractLastElement:
@@ -999,6 +1033,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::Broadcast:
     O << "broadcast";
     break;
+  case VPInstruction::BuildVector:
+    O << "buildvector";
+    break;
+  case VPInstruction::BuildStructVector:
+    O << "buildstructvector";
+    break;
   case VPInstruction::ExtractLastElement:
     O << "extract-last-element";
     break;
@@ -2758,20 +2798,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     scalarizeInstruction(UI, this, VPLane(0), State);
     return;
   }
-
-  // A store of a loop varying value to a uniform address only needs the last
-  // copy of the store.
-  if (isa<StoreInst>(UI) && vputils::isSingleScalar(getOperand(1))) {
-    auto Lane = VPLane::getLastLaneForVF(State.VF);
-    scalarizeInstruction(UI, this, VPLane(Lane), State);
-    return;
-  }
-
-  // Generate scalar instances for all VF lanes.
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  const unsigned EndLane = State.VF.getKnownMinValue();
-  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-    scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
 
 bool VPReplicateRecipe::shouldPack() const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5b42a9056b69e..d2c17b7f52b76 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1142,6 +1142,22 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  // Look through Extract(Last|Penultimate)Element (BuildVector ....).
+  if (match(&R,
+            m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) ||
+      match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
+                    m_VPValue(A)))) {
+    unsigned Offset = cast<VPInstruction>(&R)->getOpcode() ==
+                              VPInstruction::ExtractLastElement
+                          ? 1
+                          : 2;
+    auto *BV = dyn_cast<VPInstruction>(A);
+    if (BV && BV->getOpcode() == VPInstruction::BuildVector) {
+      Def->replaceAllUsesWith(BV->getOperand(BV->getNumOperands() - Offset));
+      return;
+    }
+  }
+
   // Some simplifications can only be applied after unrolling. Perform them
   // below.
   if (!Plan->isUnrolled())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 34e2de4eb3b74..f45b7a7969d04 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -99,6 +99,10 @@ struct VPlanTransforms {
   /// Explicitly unroll \p Plan by \p UF.
   static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
 
+  /// Explicitly unroll VPReplicateRecipes outside of replicate regions by \p
+  /// VF.
+  static void unrollByVF(VPlan &Plan, ElementCount VF);
+
   /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
   /// resulting plan to \p BestVF and \p BestUF.
   static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index e1fb3d476c58d..331b395f30490 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -15,6 +15,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanCFG.h"
+#include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
@@ -428,3 +429,83 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
 
   VPlanTransforms::removeDeadRecipes(Plan);
 }
+
+/// Create a single-scalar clone of RepR for lane \p Lane.
+static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
+                                       Type *IdxTy, VPReplicateRecipe *RepR,
+                                       VPLane Lane) {
+  // Collect the operands at Lane, creating extracts as needed.
+  SmallVector<VPValue *> NewOps;
+  for (VPValue *Op : RepR->operands()) {
+    if (vputils::isSingleScalar(Op)) {
+      NewOps.push_back(Op);
+      continue;
+    }
+    VPValue *Ext;
+    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+      Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op});
+    } else {
+      // Look through buildvector to avoid unnecessary extracts.
+      auto *BV = dyn_cast<VPInstruction>(Op);
+      if (BV && BV->getOpcode() == VPInstruction::BuildVector) {
+        NewOps.push_back(BV->getOperand(Lane.getKnownLane()));
+        continue;
+      }
+      VPValue *Idx =
+          Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+      Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
+    }
+    NewOps.push_back(Ext);
+  }
+
+  auto *New =
+      new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
+                            /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
+  New->insertBefore(RepR);
+  return New;
+}
+
+void VPlanTransforms::unrollByVF(VPlan &Plan, ElementCount VF) {
+  Type *IdxTy = IntegerType::get(
+      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      if (!RepR || RepR->isSingleScalar())
+        continue;
+
+      VPBuilder Builder(RepR);
+      SmallVector<VPValue *> LaneDefs;
+      // Stores to invariant addresses only need to store the last lane.
+      if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+          vputils::isSingleScalar(RepR->getOperand(1))) {
+        cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF));
+        RepR->eraseFromParent();
+        continue;
+      }
+
+      /// Create single-scalar version of RepR for all lanes.
+      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
+
+      /// Users that only demand the first lane can use the definition for lane
+      /// 0.
+      RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
+        return U.onlyFirstLaneUsed(RepR);
+      });
+
+      Type *ResTy = RepR->getUnderlyingInstr()->getType();
+      // If needed, create a Build(Struct)Vector recipe to insert the scalar
+      // lane values into a vector.
+      if (!ResTy->isVoidTy()) {
+        VPValue *VecRes = Builder.createNaryOp(
+            ResTy->isStructTy() ? VPInstruction::BuildStructVector
+                                : VPInstruction::BuildVector,
+            LaneDefs);
+        RepR->replaceAllUsesWith(VecRes);
+      }
+      RepR->eraseFromParent();
+    }
+  }
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 83e9d6146755d..743aedee38012 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -398,12 +398,6 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3
 ; CHECK-NEXT:    store float [[TMP30]], ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1
-; CHECK-NEXT:    [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2
-; CHECK-NEXT:    [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3
 ; CHECK-NEXT:    [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4
 ; CHECK-NEXT:    store float [[TMP36]], ptr [[B:%.*]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
index cdc7839bfc0f0..95258e65bbe3d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
@@ -32,42 +32,31 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 104
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 112
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 120
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]]
+; CHECK-NEXT:    [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP3]]
-; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]]
+; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP7]]
-; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]]
+; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP10]]
-; CHECK-NEXT:    [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]]
-; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]]
-; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]]
-; CHECK-NEXT:    [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]]
+; CHECK-NEXT:    [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]]
+; CHECK-NEXT:    [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]]
+; CHECK-NEXT:    [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]]
+; CHECK-NEXT:    [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 4
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 4
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 4
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 4
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 4
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i64 4
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i64 4
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i64 4
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 4
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[NEXT_GEP11]], i64 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP12]], i64 4
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[NEXT_GEP13]], i64 4
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[NEXT_GEP14]], i64 4
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[NEXT_GEP15]], i64 4
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr [[NEXT_GEP16]], i64 4
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[TMP16]], i32 -4
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP20]], i32 -4
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[TMP24]], i32 -4
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -4
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[TMP29]], i32 -4
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP30]], i32 -4
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -85,7 +74,7 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
 ; CHECK-NEXT:    [[TMP38:%.*]] = add <4 x i32> [[STRIDED_VEC23]], [[STRIDED_VEC22]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = add <4 x i32> [[STRIDED_VEC26]], [[STRIDED_VEC25]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0
-; CHECK-NEXT:    store i32 [[TMP40]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT:    store i32 [[TMP40]], ptr [[NEXT_GEP12]], align 4
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2
@@ -93,7 +82,7 @@ define ptr @test_in...
[truncated]
 | 
85cb01c    to
    c354bdc      
    Compare
  
    There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping :)
| // cost model is complete for better cost estimates. | ||
| VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, | ||
| OrigLoop->getHeader()->getContext()); | ||
| VPlanTransforms::runPass(VPlanTransforms::unrollByVF, BestVPlan, BestVF); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the general idea that by unrolling the replicate recipes we open up more folding/dce opportunities within vplan, to reduce the burden on dce/instcombine?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Exposing more folding opportunities is a nice benefit, although besides that I think the main benefits are simpler recipe ::execute and simpler state management (VPTransformState).
It also encourages more flexible/general recipes and adds utilities that can be useful in the future, e.g. BuildVector for SLP and potentially also allows more accurate cost estimates, by making vector <-> scalar transitions explicit.
| BranchOnCount, | ||
| BranchOnCond, | ||
| Broadcast, | ||
| /// Creates a vector containing all operands. The vector element count | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does this work with scalable vectors? It sounds very similar to the BuildVector ISD node in codegen. If it's not expected to work for scalable element counts it's worth spelling it out explicitly in the comments I think.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep scalable vectors aren't supported, updated to say Creates a fixed-width vector ..., thanks
| /// Creates a vector containing all operands. The vector element count | ||
| /// matches the number of operands. | ||
| BuildVector, | ||
| /// Creates a struct of vectors containing all operands. The vector element | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also added fixed-width vectors here
| return State.get(getOperand(0), | ||
| VPLane(cast<ConstantInt>(getOperand(1)->getLiveInIRValue()) | ||
| ->getZExtValue())); | ||
| Value *Vec = State.get(getOperand(0)); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are lines 499-501 not deleted, given that we've already returned on line 496?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep removed, thanks
| ? 1 | ||
| : 2; | ||
| auto *BV = dyn_cast<VPInstruction>(A); | ||
| if (BV && BV->getOpcode() == VPInstruction::BuildVector) { | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you use match here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added a m_BuildVector matcher, which for which we don't support matching the operands, which is variable. That seems to work OK with @ayalz's suggestion below.
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] | ||
| ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] | ||
| ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] | ||
| ; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP5]], i32 0 | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks worse than before I think. Seems like there is no user of TMP38.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this is a case we won't be able to handle until we also unroll replicate regions by VF. There is a use in the VPReplicateRecipe in a replicate region, which prevents us from removing the BuildVector, even though during codegen we don't use it.
| // cost model is complete for better cost estimates. | ||
| VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, | ||
| OrigLoop->getHeader()->getContext()); | ||
| VPlanTransforms::runPass(VPlanTransforms::unrollByVF, BestVPlan, BestVF); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it reasonable to unroll by VF before unrolling by UF rather than afterwards? BestVF is conceptually chosen before BestUF.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The only reason to run it after unrolling by UF is that the current position matches the order VPReplicateRecipes currently generate code, so less test changes.
| return Data.VPV2Scalars[Def][0]; | ||
| } | ||
|  | ||
| // Look through BuildVector to avoid redundant extracts. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is done now rather than later to reduce test diff?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep without the fold we would have additional insert/extracts for uses inside replicate regions, with corresponding test changes.
|  | ||
| // Look through BuildVector to avoid redundant extracts. | ||
| // TODO: Remove once replicate regions are unrolled explicitly. | ||
| auto *BV = dyn_cast<VPInstruction>(Def); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| auto *BV = dyn_cast<VPInstruction>(Def); | |
| auto *BuildV = dyn_cast<VPInstruction>(Def); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks!
| // TODO: Remove once replicate regions are unrolled explicitly. | ||
| auto *BV = dyn_cast<VPInstruction>(Def); | ||
| if (Lane.getKind() == VPLane::Kind::First && BV && | ||
| BV->getOpcode() == VPInstruction::BuildVector) { | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth matching?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Matched using a matcher w/o ops, thanks.
| /// Creates a vector containing all operands. The vector element count | ||
| /// matches the number of operands. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// Creates a vector containing all operands. The vector element count | |
| /// matches the number of operands. | |
| /// Creates a vector containing all operands. The number of operands | |
| /// matches the vector element count. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
| VPlanTransforms::removeDeadRecipes(Plan); | ||
| } | ||
|  | ||
| /// Create a single-scalar clone of RepR for lane \p Lane. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// Create a single-scalar clone of RepR for lane \p Lane. | |
| /// Create a single-scalar clone of \p RepR for lane \p Lane. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
| /// Explicitly unroll \p Plan by \p UF. | ||
| static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx); | ||
|  | ||
| /// Explicitly unroll VPReplicateRecipes outside of replicate regions by \p | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps converting replicate recipes to (or replacing with) sets of clone recipes per lane would be more accurate than "unroll".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated thanks
| } | ||
| VPValue *Ext; | ||
| if (Lane.getKind() == VPLane::Kind::ScalableLast) { | ||
| Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}); | |
| Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}); | |
| NewOps.push_back(Ext); | |
| continue; | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
| } | ||
|  | ||
| auto *New = | ||
| new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could this be a VPInstruction?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
|  | ||
| VPBuilder Builder(RepR); | ||
| SmallVector<VPValue *> LaneDefs; | ||
| // Stores to invariant addresses only need to store the last lane. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // Stores to invariant addresses only need to store the last lane. | |
| // Stores to invariant addresses need to store the last lane only. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
| ✅ With the latest revision this PR passed the C/C++ code formatter. | 
|  | ||
| VPBuilder Builder(RepR); | ||
| SmallVector<VPValue *> LaneDefs; | ||
| // Stores to invariant addresses need to store the last lane only. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, should this be considered RepR->isSingleScalar() too and/or converted to storing last lane only independent of replicatingByVF(), possibly using ExtractLast?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, should be able to move this to convertToSingleScalar.
| /// Replace replicating VPReplicateRecipes outside replicate regions in \p | ||
| /// Plan with \p VF single-scalar recipes. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// Replace replicating VPReplicateRecipes outside replicate regions in \p | |
| /// Plan with \p VF single-scalar recipes. | |
| /// Replace each VPReplicateRecipe outside on any replicate region in \p Plan | |
| /// with \p VF single-scalar recipes. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
|  | ||
| /// Replace replicating VPReplicateRecipes outside replicate regions in \p | ||
| /// Plan with \p VF single-scalar recipes. | ||
| /// TODO: Also unroll VPReplicateRegions by VF. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// TODO: Also unroll VPReplicateRegions by VF. | |
| /// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby | |
| /// dissolving the latter. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated thanks
| Type *ResTy = RepR->getUnderlyingInstr()->getType(); | ||
| // If needed, create a Build(Struct)Vector recipe to insert the scalar | ||
| // lane values into a vector. | ||
| if (!ResTy->isVoidTy()) { | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If ResTy is void (better check for empty users instead?) suffice to clone for lanes and erase from parent, w/o populating LaneDefs, handling all stores early following those to a single address.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
| /// Creates a struct of fixed-width vectors containing all operands. The | ||
| /// number of operands | ||
| /// matches the number of fields in the struct. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// Creates a struct of fixed-width vectors containing all operands. The | |
| /// number of operands | |
| /// matches the number of fields in the struct. | |
| /// Given operands of (the same) struct type, creates a struct of fixed- | |
| /// width vectors each containing a struct field of all operands. The | |
| /// number of operands matches the element count of every vector. | 
(Strictly speaking, the vectors created contain the fields of the operands, rather than the complete operands, in contrast to BuildVector.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
| for (const auto &[Idx, Op] : enumerate(operands())) { | ||
| for (unsigned I = 0; I != NumOfElements.getKnownMinValue(); I++) { | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both loops seem to always have the same trip count, namely the number of operands (i.e., width of each resulting vector), but one (I) should be the number of fields in the struct (i.e., number of resulting vectors)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yep the inner loop needs to process StructTy->getNumElements, which may be different than the number of operands.
| } | ||
| // Look through ExtractPenultimateElement (BuildVector ....). | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| } | |
| // Look through ExtractPenultimateElement (BuildVector ....). | |
| } | |
| // Look through ExtractPenultimateElement (BuildVector ....). | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
| ; // Lane 0 | ||
| ; CHECK: [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0 | ||
| ; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 | ||
| ; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i64 0 | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there other BuildStructVector tests? Here number of fields = VF = 2.
@struct_return_i32_three_results_widen below works w/o any change, because it works w/o replication?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes we are indeed missing a case where we replicate more lanes than VF (or less). Will add separately.
| // Insert scalar instance packing it into a vector. | ||
| if (State.VF.isVector() && shouldPack()) { | ||
| // If we're constructing lane 0, initialize to start from poison. | ||
| if (State.Lane->isFirstLane()) { | ||
| assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); | ||
| Value *Poison = | ||
| PoisonValue::get(VectorType::get(UI->getType(), State.VF)); | ||
| State.set(this, Poison); | ||
| } | ||
| State.packScalarIntoVectorizedValue(this, *State.Lane); | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Joint proposal with @aniragil)
Replicate recipes model several aspects: (a) multiple Instruction replicas will be generated, (b) all operands will be used as scalars, and (c) the "single def" result will be provided as scalars. These affect both cost and code-gen. Taking care of (a) as a VPlan-to-VPlan transform, i.e., replicating-by-VF prior to code-gen, helps simplify the latter, including VPTranslateState. In order to take care of (b) and (c) prior to code-gen, and prior to replicating-by-VF (i.e., before committing to a single VF), could we introduce a "pack" recipe that converts a single VPValue operand which represents multiple scalars into a single def VPValue which holds them in a vector, and a converse "unpack" recipe? The former will be converted to BuildVector (or BuildStructVector) recipe by replicateByVF, and the latter to a set of VF extracts? This would help clarify which VPValues represent vectors and which hold their VF elements as scalars, thereby potentially improving cost, simplifying replicateByVF, and possibly code-gen (as done here). WDYT?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds like this should work as well.
I sketched it here: #145188. When unrolling by VF, we keep a mapping of defs -> lane definintions and use that to lower/serve Pack/Unpack VPInstructions.
It still needs some cleanup and we still have to generate some extracts.
Here's the code for Pack:51b88fa
And unpack: af03b31
And we need also need to pack replicate recipes in a vector, if they are used by replicate regions, so the VPReplicateRecipes in replicate regions can access the unrolled values: 7f53c1c
Given that this requires quite a few improvements, might be simpler to landing initial unrolling first, then the pack/unpack patches?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, very well, indeed seems simpler to move backwards by first landing unrolling and then introduce pack/unpack earlier in the pipeline. Thanks for checking!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
First step to move materializing buildvectors earlier: #151487
| VPLane(cast<ConstantInt>(getOperand(1)->getLiveInIRValue()) | ||
| ->getZExtValue())); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed theoretically, but this deals with explicit unrolling by VF, which may be worth bounding to prevent code bloat, which may also impact performance. I.e., VF's larger than 255 could still be supported if only vectors are produced. (OTOH, legalization could also take place later...).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, with some additional comments, and separate test(s) added.
| return true; | ||
| assert(R->getNumOperands() == std::tuple_size<Ops_t>::value && | ||
| "recipe with matched opcode does not have the expected number of " | ||
| "operands"); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Independent nit: worth noting below that "Commutative" checks operands in reverse order, which works best for binary operations.
| auto *VPI = dyn_cast<VPInstruction>(R); | ||
| if (VPI && VPI->getOpcode() == VPInstruction::BuildVector) | ||
| return true; | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| auto *VPI = dyn_cast<VPInstruction>(R); | |
| if (VPI && VPI->getOpcode() == VPInstruction::BuildVector) | |
| return true; | |
| // Finally match operands, except for BuildVector which is matched w/o checking its operands. | |
| auto *VPI = dyn_cast<VPInstruction>(R); | |
| if (VPI && VPI->getOpcode() == VPInstruction::BuildVector) | |
| return true; | 
| auto *VPI = dyn_cast<VPInstruction>(R); | ||
| if (VPI && VPI->getOpcode() == VPInstruction::BuildVector) | ||
| return true; | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better to check instead if Ops_t is empty and if so assert that R is BuildVector and early return, or set NumOperands to zero instead of R->getNumOperands() and continue?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved up to handle the case first, thanks
| inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() { | ||
| return ZeroOpVPInstruction_match<VPInstruction::BuildVector>(); | ||
| } | ||
|  | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() { | |
| return ZeroOpVPInstruction_match<VPInstruction::BuildVector>(); | |
| } | |
| /// BuildVector is matches only its opcode, w/o matching its operands. | |
| inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() { | |
| return ZeroOpVPInstruction_match<VPInstruction::BuildVector>(); | |
| } | |
plus some explanation why - number of operands varies?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
| unsigned IdxToExtract = | ||
| cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue(); | ||
| return State.get(getOperand(0), VPLane(IdxToExtract)); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The fact that the second operand must be constant should be documented and verified (if modeled as a general operand rather than a "flag").
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had to update the code here; there is code that may construct ExtractElement with non-constant values (early-exit with forced interleaving, where it is the first active lane)
| // Uniform within VL means we need to generate lane 0. | ||
| if (!State.Lane) { | ||
| assert(IsSingleScalar && | ||
| "VPReplicateRecipes outside replicate regions must be unrolled"); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| "VPReplicateRecipes outside replicate regions must be unrolled"); | |
| "VPReplicateRecipes outside replicate regions must have already been unrolled"); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated thanks
| scalarizeInstruction(UI, this, VPLane(Lane), State); | ||
| return; | ||
| assert((State.VF.isScalar() || !isSingleScalar()) && | ||
| "uniform recipe shouldn't be predicated"); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| "uniform recipe shouldn't be predicated"); | |
| "uniform recipe shouldn't be predicated"); | |
| assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); | 
retain the assert here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
| if (State.Lane->isFirstLane()) { | ||
| assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); | ||
| WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF)); | ||
| } else { | ||
| WideValue = State.get(this); | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (State.Lane->isFirstLane()) { | |
| assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); | |
| WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF)); | |
| } else { | |
| WideValue = State.get(this); | |
| } | |
| if (State.Lane->isFirstLane()) | |
| WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF)); | |
| else | |
| WideValue = State.get(this); | 
drop it from here?
Can also simplify using a ternary Value *WideValue = State.Lane->isFirstLane() ? ... : ... ;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
| if (isa<StoreInst>(RepR->getUnderlyingInstr()) && | ||
| vputils::isSingleScalar(RepR->getOperand(1))) { | ||
| cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF)); | ||
| RepR->eraseFromParent(); | ||
| continue; | ||
| } | ||
|  | ||
| /// Create single-scalar version of RepR for all lanes. | ||
| for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) | ||
| LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); | ||
|  | ||
| if (RepR->getNumUsers() == 0) { | ||
| RepR->eraseFromParent(); | ||
| continue; | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps better to deal with both useless cases first:
| if (isa<StoreInst>(RepR->getUnderlyingInstr()) && | |
| vputils::isSingleScalar(RepR->getOperand(1))) { | |
| cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF)); | |
| RepR->eraseFromParent(); | |
| continue; | |
| } | |
| /// Create single-scalar version of RepR for all lanes. | |
| for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) | |
| LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); | |
| if (RepR->getNumUsers() == 0) { | |
| RepR->eraseFromParent(); | |
| continue; | |
| } | |
| if (RepR->getNumUsers() == 0) { | |
| if (isa<StoreInst>(RepR->getUnderlyingInstr()) && | |
| vputils::isSingleScalar(RepR->getOperand(1))) { | |
| // Stores to invariant addresses need to store the last lane only. | |
| cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF)); | |
| } else { | |
| // Create single-scalar version of RepR for all lanes. | |
| for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) | |
| cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)); | |
| } | |
| RepR->eraseFromParent(); | |
| continue; | |
| } | |
| /// Create and record single-scalar version of RepR for all lanes. | |
| SmallVector<VPValue *> LaneDefs; | |
| for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) | |
| LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I))); | |
| `` | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
| // If needed, create a Build(Struct)Vector recipe to insert the scalar | ||
| // lane values into a vector. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So a pair of replicating recipes one feeding the other is replaced by VF recipes feeding a buildVector which VF other recipes extract from, where the extracts are optimized away by cloneForLane(); and the buildVector possibly by dce?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep
| Hi @fhahn The following starts crashing with this patch:  | 
| Thanks should be fixed by 3b7b95f | 
| 
 Yep, thanks! There seem to be more problems though: | 
| I'm also seeing failed asserts on the latest git main. Reduced reproducer: float GammaFactor_f_value;
double pow();
void malloc();
typedef struct {
  float f_black_crush;
  float f_white_crush;
  float f_gamma
} panoramix_gamma_t;
typedef struct {
  char p_lut[][1][6]
} video_splitter_sys_t;
double GammaFactor();
void Open() {
  video_splitter_sys_t *p_sys = malloc;
  panoramix_gamma_t p_gamma[5];
  p_gamma[0].f_black_crush = 255.0;
  for (int i_index2 = 0; i_index2 <= 1000; i_index2++)
    for (int i_plane = 0; i_plane < 5; i_plane++) {
      double f_factor = GammaFactor(&p_gamma[i_plane]);
      float f_lut = i_index2 * f_factor;
      p_sys->p_lut[i_plane][i_index2][0] = f_lut;
    }
}
double GammaFactor(panoramix_gamma_t *g) {
  if (GammaFactor_f_value <= g->f_black_crush)
    return pow(GammaFactor_f_value, 1.0 / g->f_gamma);
  if (g->f_white_crush)
    return pow(0, 1.0 / g->f_gamma);
}Compiled like this: $ clang -target aarch64-linux-gnu -c -O2 repro.c -fno-math-errno
clang: ../lib/Transforms/Vectorize/VPlanRecipes.cpp:2758: virtual void llvm::VPReplicateRecipe::execute(llvm::VPTransformState&): Assertion `IsSingleScalar && "VPReplicateRecipes outside replicate regions " "must have already been unrolled"' failed. | 
| Seeing similar.  | 
| @sscalpone @mstorsjo thanks for sharing the additional test cases, in some cases we cannot yet convert all hoisted recipes to single-scalar VPReplicateRecipes after hoisting. Should be fixed by making sure we also unroll all VPReplicateRecipes outside the loop region by VF: 1949536 | 
| 
 Thanks, that solved the second problem I saw as well. | 
| auto *New = | ||
| new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, | ||
| /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR); | ||
| New->insertBefore(RepR); | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That this failed to call transferFlags here, which resulted in miscompiles. I've created #147398 to fix.
Explicitly unroll VPReplicateRecipes outside replicate regions by VF, replacing them by VF single-scalar recipes. Extracts for operands are added as needed and the scalar results are combined to a vector using a new BuildVector VPInstruction.
It also adds a few folds to simplify unnecessary extracts/BuildVectors.
It also adds a BuildStructVector opcode for handling of calls that have struct return types.
VPReplicateRecipe in replicate regions can will be unrolled as follow up, turing non-single-scalar VPReplicateRecipes into 'abstract', i.e. not executable.